According to Wikipedia, Ford GoBike is a public bike-sharing system in California's San Francisco Bay Area. Now known as Bay Wheels, it was the first regional and large-scale bike-sharing system deployed in California and on the West Coast of the United States. It was established as Bay Area Bike Share in August 2013. As of January 2018, the system had more than 2,600 bikes at 262 stations in San Francisco, the East Bay and San Jose.
In this study, trip data provided by the bike-sharing program for February 2019 will be analyzed, first through an exploratory analysis and finally through an explanatory analysis.
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.cm as cm
from geopy.geocoders import Nominatim
import time
%matplotlib inline
# Load the raw Ford GoBike trip records (February 2019 extract) into a DataFrame.
df = pd.read_csv('201902-fordgobike-tripdata.csv')
def checkDataFrame(df, dfname=''):
    '''
    Print a plain-text summary of a DataFrame.

    The report contains, in order: a header with ``dfname``, the row and
    column counts, the ``df.info(verbose=True)`` listing, ``df.describe()``,
    and the ``value_counts()`` of every column, separated by ruler lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    dfname : str, optional
        Label printed under the report header.

    Returns
    -------
    None
        Everything is written to standard output.
    '''
    print('Dataframe Summary\n')
    print(dfname)
    print('=' * 100)
    print('\tRows: {} Columns {}\n'.format(df.shape[0], df.shape[1]))
    print('-' * 100)
    # info() writes its report itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the summary.
    df.info(verbose=True)
    print('-' * 100)
    print(df.describe())
    print('-' * 100)
    # Frequency table for every column, one section per column.
    for column in df.columns:
        print(df[column].value_counts())
        print('-' * 100)
    print('Summary END')
    print('=' * 100)
# Print the full summary (shape, dtypes, describe, per-column value counts) of the raw trips.
checkDataFrame(df,'fordgobike')
Dataframe Summary
fordgobike
====================================================================================================
Rows: 183412 Columns 16
----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183412 entries, 0 to 183411
Data columns (total 16 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 duration_sec 183412 non-null int64
1 start_time 183412 non-null object
2 end_time 183412 non-null object
3 start_station_id 183215 non-null float64
4 start_station_name 183215 non-null object
5 start_station_latitude 183412 non-null float64
6 start_station_longitude 183412 non-null float64
7 end_station_id 183215 non-null float64
8 end_station_name 183215 non-null object
9 end_station_latitude 183412 non-null float64
10 end_station_longitude 183412 non-null float64
11 bike_id 183412 non-null int64
12 user_type 183412 non-null object
13 member_birth_year 175147 non-null float64
14 member_gender 175147 non-null object
15 bike_share_for_all_trip 183412 non-null object
dtypes: float64(7), int64(2), object(7)
memory usage: 22.4+ MB
None
----------------------------------------------------------------------------------------------------
duration_sec start_station_id start_station_latitude \
count 183412.000000 183215.000000 183412.000000
mean 726.078435 138.590427 37.771223
std 1794.389780 111.778864 0.099581
min 61.000000 3.000000 37.317298
25% 325.000000 47.000000 37.770083
50% 514.000000 104.000000 37.780760
75% 796.000000 239.000000 37.797280
max 85444.000000 398.000000 37.880222
start_station_longitude end_station_id end_station_latitude \
count 183412.000000 183215.000000 183412.000000
mean -122.352664 136.249123 37.771427
std 0.117097 111.515131 0.099490
min -122.453704 3.000000 37.317298
25% -122.412408 44.000000 37.770407
50% -122.398285 100.000000 37.781010
75% -122.286533 235.000000 37.797320
max -121.874119 398.000000 37.880222
end_station_longitude bike_id member_birth_year
count 183412.000000 183412.000000 175147.000000
mean -122.352250 4472.906375 1984.806437
std 0.116673 1664.383394 10.116689
min -122.453704 11.000000 1878.000000
25% -122.411726 3777.000000 1980.000000
50% -122.398279 4958.000000 1987.000000
75% -122.288045 5502.000000 1992.000000
max -121.874119 6645.000000 2001.000000
----------------------------------------------------------------------------------------------------
272 311
324 292
323 291
306 291
305 290
...
5803 1
11870 1
5931 1
15964 1
5058 1
Name: duration_sec, Length: 4752, dtype: int64
----------------------------------------------------------------------------------------------------
2019-02-15 08:43:18.4220 2
2019-02-25 08:52:07.5820 2
2019-02-11 17:05:07.8400 2
2019-02-15 07:47:00.1970 2
2019-02-19 17:52:44.1750 2
..
2019-02-15 17:50:13.9340 1
2019-02-06 09:46:44.5560 1
2019-02-21 04:54:08.7390 1
2019-02-28 11:02:49.3980 1
2019-02-01 20:19:15.9460 1
Name: start_time, Length: 183401, dtype: int64
----------------------------------------------------------------------------------------------------
2019-02-12 09:07:04.4750 2
2019-02-14 08:20:06.2300 2
2019-02-28 17:40:37.3280 2
2019-02-11 18:53:55.0820 2
2019-02-05 17:48:45.8240 2
..
2019-02-23 00:20:08.0250 1
2019-02-22 22:34:46.6340 1
2019-02-26 17:19:12.1370 1
2019-02-12 10:50:48.6520 1
2019-02-20 10:41:43.8300 1
Name: end_time, Length: 183397, dtype: int64
----------------------------------------------------------------------------------------------------
58.0 3904
67.0 3544
81.0 3052
21.0 2895
3.0 2760
...
301.0 9
51.0 7
300.0 4
224.0 4
344.0 2
Name: start_station_id, Length: 329, dtype: int64
----------------------------------------------------------------------------------------------------
Market St at 10th St 3904
San Francisco Caltrain Station 2 (Townsend St at 4th St) 3544
Berry St at 4th St 3052
Montgomery St BART Station (Market St at 2nd St) 2895
Powell St BART Station (Market St at 4th St) 2760
...
Willow St at Vine St 9
Parker Ave at McAllister St 7
Palm St at Willow St 4
21st Ave at International Blvd 4
16th St Depot 2
Name: start_station_name, Length: 329, dtype: int64
----------------------------------------------------------------------------------------------------
37.776619 3904
37.776639 3544
37.775880 3052
37.789625 2895
37.786375 2760
...
37.776101 7
37.784855 4
37.317298 4
37.766349 2
37.380000 1
Name: start_station_latitude, Length: 334, dtype: int64
----------------------------------------------------------------------------------------------------
-122.417385 3904
-122.395526 3544
-122.393170 3052
-122.400811 2895
-122.404904 2760
...
-122.453093 7
-121.884995 4
-122.239305 4
-122.396292 2
-121.980000 1
Name: start_station_longitude, Length: 335, dtype: int64
----------------------------------------------------------------------------------------------------
67.0 4857
58.0 3973
21.0 3647
15.0 3368
3.0 2997
...
51.0 9
300.0 7
224.0 6
344.0 6
301.0 5
Name: end_station_id, Length: 329, dtype: int64
----------------------------------------------------------------------------------------------------
San Francisco Caltrain Station 2 (Townsend St at 4th St) 4857
Market St at 10th St 3973
Montgomery St BART Station (Market St at 2nd St) 3647
San Francisco Ferry Building (Harry Bridges Plaza) 3368
Powell St BART Station (Market St at 4th St) 2997
...
Parker Ave at McAllister St 9
Palm St at Willow St 7
21st Ave at International Blvd 6
16th St Depot 6
Willow St at Vine St 5
Name: end_station_name, Length: 329, dtype: int64
----------------------------------------------------------------------------------------------------
37.776639 4857
37.776619 3973
37.789625 3647
37.795392 3368
37.786375 2997
...
37.766349 6
37.784855 6
37.318450 5
37.380000 2
37.430000 1
Name: end_station_latitude, Length: 335, dtype: int64
----------------------------------------------------------------------------------------------------
-122.395526 4857
-122.417385 3973
-122.400811 3647
-122.394203 3368
-122.404904 2997
...
-121.884995 7
-122.239305 6
-122.396292 6
-121.883172 5
-121.980000 2
Name: end_station_longitude, Length: 335, dtype: int64
----------------------------------------------------------------------------------------------------
4794 191
4814 176
5014 174
4422 174
5175 173
...
5647 1
3655 1
1194 1
3114 1
5891 1
Name: bike_id, Length: 4646, dtype: int64
----------------------------------------------------------------------------------------------------
Subscriber 163544
Customer 19868
Name: user_type, dtype: int64
----------------------------------------------------------------------------------------------------
1988.0 10236
1993.0 9325
1989.0 8972
1990.0 8658
1991.0 8498
...
1930.0 1
1910.0 1
1927.0 1
1928.0 1
1878.0 1
Name: member_birth_year, Length: 75, dtype: int64
----------------------------------------------------------------------------------------------------
Male 130651
Female 40844
Other 3652
Name: member_gender, dtype: int64
----------------------------------------------------------------------------------------------------
No 166053
Yes 17359
Name: bike_share_for_all_trip, dtype: int64
----------------------------------------------------------------------------------------------------
Summary END
====================================================================================================
The data has 183412 rows of records and 16 columns. Some columns have null values that need to be analyzed to decide whether they should be treated or removed. Regarding the data types, the date and time variables need to be converted to a DateTime type, and fields that hold an ID need to be converted to strings. Finally, the birth year variable should be analyzed, since individuals with a birth year of 1878 were identified and we should examine these cases. The data are from the period of February 2019.
What interested me most in the data was finding out how it is distributed spatially and then building a real graphical representation of that distribution; we can use the latitude (LAT) and longitude (LONG) data for that.
In addition, I will try to find out which factors influence the duration of the trip in terms of date and time, age of users, point of departure and point of arrival and also in relation to the gender of the user.
The first part of my analysis will be important to evaluate the latitude and longitude data in relation to the following characteristics: the duration time, ages, gender and type of user. We can analyze these characteristics and see how they are distributed along the map. Regarding travel time I need to mainly evaluate start time information, station information and user characteristics.
# Clean a copy so the raw frame `df` stays available unchanged.
clean_df = df.copy()
As stated earlier, we need to adjust the data types to ensure better data quality: the date and time columns will be converted to datetime, the ID columns to strings, and the gender column to a category.
datetime_cols = ['start_time', 'end_time']
id_cols = ['start_station_id', 'end_station_id', 'bike_id']

# Parse the timestamp strings so date/time parts can be extracted later.
for col in datetime_cols:
    clean_df[col] = pd.to_datetime(clean_df[col])

# IDs are labels, not quantities, so store them as text. Going through the
# nullable integer dtype renders 58.0 as '58' rather than '58.0', and the
# where() keeps missing station IDs as NaN instead of the literal string
# 'nan' that a plain astype(str) would produce.
for col in id_cols:
    raw_ids = clean_df[col]
    clean_df[col] = raw_ids.astype('Int64').astype(str).where(raw_ids.notna())

# Drop trips that lack either demographic field, then give both fields
# their final dtypes (the int cast requires that no NaN remains).
clean_df = clean_df.dropna(subset=['member_gender', 'member_birth_year'])
clean_df['member_gender'] = clean_df['member_gender'].astype('category')
clean_df['member_birth_year'] = clean_df['member_birth_year'].astype('int')
Test
# Confirm the ID columns are now stored as text (object dtype).
clean_df[id_cols].dtypes
start_station_id object end_station_id object bike_id object dtype: object
# Confirm the timestamp columns were parsed to datetime64.
clean_df[datetime_cols].dtypes
start_time datetime64[ns] end_time datetime64[ns] dtype: object
# Confirm gender is categorical and birth year is an integer.
clean_df[['member_gender','member_birth_year']].dtypes
member_gender category member_birth_year int32 dtype: object
# Derive hour-of-day, day-of-week and day-of-month features from each trip's start timestamp.
clean_df['hour_value'] = clean_df['start_time'].dt.hour
clean_df['day_week'] = clean_df['start_time'].dt.dayofweek
clean_df['day'] = clean_df['start_time'].dt.day
We will remove rows with null values; dropping them will not interfere with the study, whereas filling them incorrectly could cause deviations in the final results. The distribution of ages will then be evaluated and we will finally remove the outliers.
# Box plot of birth years to spot implausibly old riders.
plt.boxplot(clean_df['member_birth_year'])
plt.ylabel('Birth Year');
# Based on the box plot, keep only riders born after 1940
# (birth years of 1940 or earlier are dropped).
lower_range = 1940
clean_df = clean_df.loc[clean_df['member_birth_year'].gt(lower_range)]
# Draw the box plot again on the filtered data.
plt.boxplot(clean_df['member_birth_year'])
plt.ylabel('Birth Year');
We will now plot the stations on the map and evaluate the null data that was found.
# Keep only rows with a known birth year. The same filter is applied earlier
# in the notebook, so this is expected to leave the frame unchanged.
clean_df = clean_df[clean_df['member_birth_year'].notna()]
plt.boxplot(clean_df['member_birth_year'])
plt.ylabel('Birth Year');
# Based on the box plot, keep only riders born after 1940
# (birth years of 1940 or earlier are dropped).
lower_range = 1940
clean_df = clean_df.loc[lower_range < clean_df['member_birth_year']]
# Draw the box plot again on the filtered data.
plt.boxplot(clean_df['member_birth_year'])
plt.ylabel('Birth Year');
Test
# Sanity check after cleaning: summary statistics of the numeric columns.
clean_df.describe()
| duration_sec | start_station_latitude | start_station_longitude | end_station_latitude | end_station_longitude | member_birth_year | hour_value | day_week | day | |
|---|---|---|---|---|---|---|---|---|---|
| count | 174944.000000 | 174944.000000 | 174944.000000 | 174944.000000 | 174944.000000 | 174944.000000 | 174944.000000 | 174944.000000 | 174944.000000 |
| mean | 704.509574 | 37.770794 | -122.351263 | 37.770989 | -122.350839 | 1984.880888 | 13.456237 | 2.620101 | 15.312443 |
| std | 1642.477572 | 0.101124 | 0.118499 | 0.101031 | 0.118064 | 9.869501 | 4.734634 | 1.808562 | 8.034238 |
| min | 61.000000 | 37.317298 | -122.453704 | 37.317298 | -122.453704 | 1941.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 323.000000 | 37.770407 | -122.411901 | 37.770407 | -122.411647 | 1980.000000 | 9.000000 | 1.000000 | 8.000000 |
| 50% | 511.000000 | 37.780760 | -122.398279 | 37.781010 | -122.397405 | 1987.000000 | 14.000000 | 3.000000 | 15.000000 |
| 75% | 789.000000 | 37.797320 | -122.282497 | 37.797673 | -122.285171 | 1992.000000 | 17.000000 | 4.000000 | 22.000000 |
| max | 84548.000000 | 37.880222 | -121.874119 | 37.880222 | -121.874119 | 2001.000000 | 23.000000 | 6.000000 | 28.000000 |
In this section we will investigate the integrity of the longitude and latitude variables and how the stations are distributed on the map.
First I will plot all stations on the map and evaluate the stations that are missing the ID and name information but do have latitude and longitude values.
# Split the trips by whether station names are present. A trip goes to the
# "null" frame only when BOTH its start and end station names are missing.
# (The original OR-ed the start-name test with itself and never looked at the
# end station.)
mask = clean_df.start_station_name.notnull() | clean_df.end_station_name.notnull()
station_null_df = clean_df[~mask]
station_notnull_df = clean_df[mask]


def _station_trace(trips, endpoint, label, marker_size):
    '''
    Build one marker layer from the distinct station coordinates in ``trips``.

    ``endpoint`` is 'start' or 'end' and selects which pair of
    ``<endpoint>_station_latitude`` / ``<endpoint>_station_longitude``
    columns is plotted. ``label`` is used both as the legend entry and as the
    hover text of every marker.
    '''
    lat_col = endpoint + '_station_latitude'
    lon_col = endpoint + '_station_longitude'
    # Many trips share the same station, so plot each coordinate pair once
    # instead of stacking one marker per trip on the same spot.
    points = trips[[lat_col, lon_col]].drop_duplicates()
    return go.Scattermapbox(
        lat=points[lat_col],
        lon=points[lon_col],
        mode='markers',
        marker=go.scattermapbox.Marker(size=marker_size),
        # A scalar string is applied to every point; a one-element list would
        # label only the first marker.
        text=label,
        name=label,
    )


# Plot every station, with unnamed stations as separate layers.
# Layers are added in the original order so the stacking on the map is kept.
fig = go.Figure()
for trips, endpoint, label, marker_size in [
    (station_null_df, 'start', 'Null Start Data', 15),
    (station_null_df, 'end', 'Null End Data', 10),
    (station_notnull_df, 'start', 'Not Null start Data', 15),
    (station_notnull_df, 'end', 'Not Null end Data', 13),
]:
    fig.add_trace(_station_trace(trips, endpoint, label, marker_size))

fig.update_layout(
    hovermode='closest',
    mapbox=dict(
        style='carto-positron',
        bearing=0,
        center=go.layout.mapbox.Center(
            lat=37.6,
            lon=-122.1
        ),
        pitch=0,
        zoom=8
    )
)
fig.show()